package com.icbc.processor;

import org.apache.commons.lang3.StringUtils;
import org.springframework.stereotype.Component;
import us.codecraft.webmagic.Page;
import us.codecraft.webmagic.Site;
import us.codecraft.webmagic.processor.PageProcessor;

/**
 * WebMagic page processor for user information.
 *
 * <p>For every downloaded page it queues all blog-post URLs found in the page
 * for further crawling, then extracts the author's nickname and avatar URL and
 * stores them as result fields. Pages without a nickname are marked as skipped.
 */
@Component
public class UserPageProcessor implements PageProcessor {

    /**
     * Pattern of the blog-post URLs to follow. The dots in the host name are
     * escaped so that they only match a literal {@code '.'}.
     */
    private static final String BLOG_URL_REGEX =
            "https://my\\.oschina\\.net/u/[0-9]+/blog/[0-9]+";

    /**
     * XPath of the author link element; the nickname and the avatar are both
     * located underneath it, so the shared prefix is kept in one place.
     */
    private static final String AUTHOR_LINK_XPATH =
            "//*[@id=\"mainScreen\"]/div/div[1]/div/div[2]/div[1]/div[2]/div[1]/div[1]/a";

    /** XPath selecting the text of the nickname {@code <span>} inside the author link. */
    private static final String NICKNAME_XPATH = AUTHOR_LINK_XPATH + "/span/text()";

    /** XPath selecting the {@code <div>} inside the author link that wraps the avatar image. */
    private static final String AVATAR_CONTAINER_XPATH = AUTHOR_LINK_XPATH + "/div";

    /** Crawl settings for the target site. */
    private final Site site = Site.me()
            .setRetryTimes(1)    // retry count
            .setSleepTime(5000)  // sleep time between pages
            .setTimeOut(10000);  // download timeout

    /**
     * Queues further blog-post URLs found in the page and extracts the user data.
     *
     * <p>Stores the trimmed {@code nickname} field and, when present, the trimmed
     * {@code avatar} field. If the nickname is blank the page is marked as skipped
     * and no field is stored.
     *
     * @param page the downloaded page to process
     */
    @Override
    public void process(Page page) {
        // Keep crawling: queue every blog-post URL that appears in this page.
        page.addTargetRequests(page.getHtml().regex(BLOG_URL_REGEX).all());

        String nickname = page.getHtml().xpath(NICKNAME_XPATH).toString();
        String avatar = page.getHtml().xpath(AVATAR_CONTAINER_XPATH).css("img", "src").toString();

        // Without a nickname there is nothing useful to emit for this page.
        if (StringUtils.isBlank(nickname)) {
            page.setSkip(true);
            return;
        }

        page.putField("nickname", nickname.trim());
        // The avatar is optional; it is only stored when one was found.
        if (StringUtils.isNotBlank(avatar)) {
            page.putField("avatar", avatar.trim());
        }
    }

    /**
     * Returns the crawl settings used for this processor.
     *
     * @return the configured {@link Site}
     */
    @Override
    public Site getSite() {
        return site;
    }

}
